# Load packages, installing librarian first if missing.
# NOTE: requireNamespace() avoids attaching librarian; shelf() is called
# with an explicit namespace below anyway.
if (!requireNamespace("librarian", quietly = TRUE)) {
  install.packages("librarian")
}
librarian::shelf(
  # spatial / data-wrangling stack (duplicate entries removed)
  dismo, dplyr, DT, GGally, ggplot2, here, htmltools, leaflet, mapview,
  purrr, raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr,
  caret,       # m: modeling framework
  pdp,         # X: partial dependence plots
  ranger,      # m: random forest modeling
  rpart,       # m: recursive partition modeling
  rpart.plot,  # m: recursive partition plotting
  rsample,     # d: split train/test data
  skimr,       # d: skim summarize data table
  vip)         # X: variable importance

select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = FALSE)

# set random seed for reproducibility
set.seed(42)

# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = FALSE, recursive = TRUE)

1a.EXPLORE

1) Species: Latrodectus hesperus (western black widow)

2) Image:

Western black widow

3) Map:

obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo    <- TRUE

# Download GBIF occurrences and cache to GeoJSON.
# NOTE: redo = TRUE forces a fresh download every run; use || (scalar OR)
# since both operands are length-1 logicals.
if (!file.exists(obs_geo) || redo){
  # get species occurrence data from GBIF with coordinates
  (res <- spocc::occ(
# query = 'Bradypus variegatus', # sloth
# query = 'Phengaris alcon', # Alcon blue
# query = 'Camponotus pennsylvanicus', # ant
  query = 'Latrodectus hesperus', # western black widow
  from  = 'gbif', has_coords = TRUE,
  limit = 10000))
  
  # extract data frame from result
  df <- res$gbif$data[[1]] 
  readr::write_csv(df, obs_csv)
  
  # convert to points of observation from lon/lat columns in data frame
  obs <- df %>% 
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>% 
    select(prov, key) # save space (joinable from obs_csv)
  sf::write_sf(obs, obs_geo, delete_dsn = TRUE)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 4936
# show points on map
# mapview::mapview(obs, map.types = "Stamen.Terrain")
mapview::mapview(obs, map.types = "Stamen.Watercolor")

4) Question:

4,936 observations in GBIF (per `nrow(obs)` above)

5) Question:

No data points were removed for this terrestrial species because there are no known data points on water.

dir_env <- file.path(dir_data, "env")

# point sdmpredictors at our local cache directory
options(sdmpredictors_datadir = dir_env)

# restrict to terrestrial (non-marine) datasets
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# browse the available datasets in an interactive table
DT::datatable(
  select(env_datasets, dataset_code, description, citation))
# datasets of interest
env_datasets_vec <- c("WorldClim", "ENVIREM")

# list the layers those datasets provide
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)

6) Question:

What environmental layers did you choose as predictors? Can you find any support for these in the literature?

Altitude, temperature, terrain, and wetness. Western black widows are found in desert as well as mountainous regions. Although they prefer dark, dry conditions, they are able to survive in a variety of environments.

# Predictor layers chosen after inspecting the options (and the literature).
env_layers_vec <- c(
  "WC_alt",     # altitude
  "WC_bio1",    # annual mean temp
  "WC_bio2",    # mean diurnal temp range
  "ER_tri",     # terrain roughness index
  "ER_topoWet") # topographic wetness

# download / load the raster layers
env_stack <- load_layers(env_layers_vec)

7) Rasters clipped to species range

# Static plot of all layers, two columns.
# (An interactive mapview(env_stack, hide = TRUE) makes the html too big for Github.)
plot(env_stack, nc = 2)

obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

# Build (or rebuild) the convex hull around the observation points;
# || (scalar OR) is the correct operator for this length-1 condition.
if (!file.exists(obs_hull_geo) || redo){
  # make convex hull around points of observation
  obs_hull <- sf::st_convex_hull(st_union(obs))
  
  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)

# show points and hull on map
mapview(
  list(obs, obs_hull))
# Clip the environment stack to the hull and cache it to disk.
if (!file.exists(env_stack_grd) || redo){
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite = TRUE)
}
env_stack <- stack(env_stack_grd)

# show map
# mapview(obs) + 
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)

8) Map with pseudo-absence points

absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

# Generate pseudo-absence points: random locations inside the study area
# that avoid cells containing a presence observation.
if (!file.exists(absence_geo) || redo){
  # get raster count of observations
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field = 1, fun = 'count')
  
  # show map
  # mapview(obs) + 
  #   mapview(r_obs)
  
  # create mask of study-area cells, excluding those with observations
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse = TRUE)
  
  # generate as many random absence points as there are presences
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
    as_tibble() %>% 
    st_as_sf(coords = c("x", "y"), crs = 4326)
  
  write_sf(absence, absence_geo, delete_dsn = TRUE)
}
absence <- read_sf(absence_geo)

# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
# Combine presences and pseudo-absences, extract predictor values at each
# point, and cache the result to CSV (|| for the scalar condition).
if (!file.exists(pts_env_csv) || redo){

  # combine presence and absence into single set of labeled points 
  pts <- rbind(
    obs %>% 
      mutate(
        present = 1) %>% 
      select(present, key),
    absence %>% 
      mutate(
        present = 0,
        key     = NA)) %>% 
    mutate(
      ID = 1:n()) %>% 
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn = TRUE)

  # extract raster values for points
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df = TRUE) %>% 
    tibble() %>% 
    # join present and geometry columns to raster value results for points
    left_join(
      pts %>% 
        select(ID, present),
      by = "ID") %>% 
    relocate(present, .after = ID) %>% 
    # extract lon, lat as single columns
    mutate(
      #present = factor(present),
      lon = st_coordinates(geometry)[,1],
      lat = st_coordinates(geometry)[,2]) %>% 
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)

pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>% 
  DT::datatable(
    rownames = FALSE,
    options = list(
      dom = "t",
      pageLength = 20))

9) Term Plots

# Density of each predictor, split by presence (1) vs absence (0).
pts_env %>% 
  select(-ID) %>% 
  mutate(present = factor(present)) %>% 
  pivot_longer(cols = -present) %>% 
  ggplot() +
  geom_density(aes(x = value, fill = present)) + 
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw() + 
  facet_wrap(~ name, scales = "free") +
  theme(
    legend.position      = c(1, 0),
    legend.justification = c(1, 0))

pts_env_csv <- file.path(dir_data, "pts_env.csv")

# re-read the cached point/environment table
pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 9872
datatable(pts_env, rownames = FALSE)

10) Plot of ggpairs

# Pairwise scatter / correlation matrix, colored by presence.
GGally::ggpairs(
  pts_env %>% select(-ID),
  mapping = aes(color = factor(present)))

1b.REGRESS

# Assemble the modeling data frame.
d <- pts_env %>% 
  select(-ID) %>%        # drop the row identifier (not a predictor)
  tidyr::drop_na()       # drop rows with missing raster values
nrow(d)
## [1] 9847

11) Linear model output with range of values

# fit a linear model
# (naive baseline: ordinary least squares on the 0/1 response)
mdl <- lm(present ~ ., data = d)
summary(mdl)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.01232 -0.12666  0.09589  0.19588  0.76372 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.758e-01  4.892e-02  -3.593 0.000328 ***
## WC_alt       5.522e-06  6.946e-06   0.795 0.426666    
## WC_bio1      1.434e-02  7.628e-04  18.800  < 2e-16 ***
## WC_bio2      3.353e-02  1.513e-03  22.166  < 2e-16 ***
## ER_tri      -1.394e-03  1.693e-04  -8.233  < 2e-16 ***
## ER_topoWet  -5.572e-02  3.512e-03 -15.866  < 2e-16 ***
## lon         -5.525e-03  7.452e-05 -74.132  < 2e-16 ***
## lat          9.038e-03  1.930e-04  46.830  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2982 on 9839 degrees of freedom
## Multiple R-squared:  0.6446, Adjusted R-squared:  0.6443 
## F-statistic:  2549 on 7 and 9839 DF,  p-value: < 2.2e-16
# predict on the training data itself to inspect the fitted range
y_predict <- predict(mdl, d, type="response")
y_true    <- d$present

# predictions fall outside [0, 1], motivating the GLM with a logit link below
range(y_predict)
## [1] -0.7637207  1.0563431
range(y_true)
## [1] 0 1

12) Generalized Linear Model (GLM)

# fit a generalized linear model with a binomial logit link function
# (keeps fitted probabilities inside (0, 1), unlike the linear model above)
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2295  -0.0018   0.1086   0.3591   3.8939  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.738e+01  1.450e+00 -18.885  < 2e-16 ***
## WC_alt       2.016e-03  1.679e-04  12.006  < 2e-16 ***
## WC_bio1      4.335e-01  3.587e-02  12.086  < 2e-16 ***
## WC_bio2      6.486e-04  2.961e-02   0.022    0.983    
## ER_tri      -1.983e-02  2.342e-03  -8.467  < 2e-16 ***
## ER_topoWet  -2.361e-01  5.830e-02  -4.049 5.14e-05 ***
## lon         -1.595e-01  6.720e-03 -23.732  < 2e-16 ***
## lat          1.996e-01  2.310e-02   8.640  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13650.8  on 9846  degrees of freedom
## Residual deviance:  3623.6  on 9839  degrees of freedom
## AIC: 3639.6
## 
## Number of Fisher Scoring iterations: 9
# type = "response" returns probabilities on the 0-1 scale
y_predict <- predict(mdl, d, type="response")

# predictions now bounded within (0, 1)
range(y_predict)
## [1] 2.220446e-16 9.945647e-01

13) GLM term plots

# Partial-residual term plots (with standard errors) for each GLM predictor.
termplot(
  mdl, partial.resid = TRUE, se = TRUE, main = FALSE, ylim = "free")

14) Generalized Additive Model (GAM)

# mgcv provides gam() with automatic smoothness selection
librarian::shelf(mgcv)

# fit a generalized additive model with smooth predictors
# (each term wrapped in s() gets its own estimated smooth function)
mdl <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + 
    s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat), 
  family = binomial, data = d)
summary(mdl)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -280.2     1686.6  -0.166    0.868
## 
## Approximate significance of smooth terms:
##                 edf Ref.df  Chi.sq p-value    
## s(WC_alt)     7.340  7.793  82.218  <2e-16 ***
## s(WC_bio1)    6.175  7.111 199.136  <2e-16 ***
## s(WC_bio2)    6.294  7.305  90.872  <2e-16 ***
## s(ER_tri)     2.562  3.318  10.441   0.018 *  
## s(ER_topoWet) 1.001  1.003   0.004   0.959    
## s(lon)        5.869  6.294 243.877  <2e-16 ***
## s(lat)        5.539  5.907 258.732  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.854   Deviance explained = 81.9%
## UBRE = -0.74126  Scale est. = 1         n = 9847

15) GAM term plots

# Smooth-term plots for the fitted GAM; scale = 0 lets each panel pick its y-range.
plot(mdl, scale = 0)

16) Question:

Which GAM environmental variables, and even range of values, seem to contribute most towards presence (above 0 response) versus absence (below 0 response)?

17) Maxent (maximum entropy)

# load extra packages
librarian::shelf(
  maptools, sf)

# cache path for the fitted maxent model
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")

# show version of maxent
# NOTE(review): guarded so it only runs when knitting (non-interactive);
# maxent() with no arguments reports the MaxEnt version
if (!interactive())
  maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)

18) Maxent variable contribution plot

# Presence-only points for maxent (it extracts raster values itself);
# maxent prefers sp::SpatialPoints over the newer sf::sf class.
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- sf::as_Spatial(
  read_sf(obs_geo))

# fit a maximum entropy model, reusing the cached fit when present
if (!file.exists(mdl_maxent_rds)){
  mdl <- maxent(env_stack, obs_sp)
  readr::write_rds(mdl, mdl_maxent_rds)
}
mdl <- read_rds(mdl_maxent_rds)

# plot variable contributions per predictor
plot(mdl)

19) Maxent term plots

# Response curves (term plots) per predictor.
response(mdl)

# predict suitability across the raster stack
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')

# raw prediction map with country borders overlaid
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package = "maptools")
plot(wrld_simpl, add = TRUE, border = 'dark grey')

20) Question:

Which Maxent environmental variables, and even range of values, seem to contribute most towards presence (closer to 1 response) and how might this differ from the GAM results?

1c.TREES

# options
options(
  scipen = 999,                  # avoid scientific notation in printouts
  readr.show_col_types = FALSE)  # quiet readr column-type messages
set.seed(42)                     # reproducible splits / models

# graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# paths
dir_data    <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

# read data; response must be a factor for classification trees
pts_env <- read_csv(pts_env_csv)
d <- pts_env %>% 
  select(-ID) %>%                   # not used as a predictor x
  mutate(
    present = factor(present)) %>%  # categorical response
  na.omit()                         # drop rows with NA
skim(d)
Data summary
Name d
Number of rows 9847
Number of columns 8
_______________________
Column type frequency:
factor 1
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
present 0 1 FALSE 2 1: 4928, 0: 4919

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
WC_alt 0 1 696.31 725.91 -70.00 167.00 404.00 1054.00 5153.00 ▇▂▁▁▁
WC_bio1 0 1 16.91 7.00 -4.80 11.60 16.60 23.10 30.60 ▁▃▇▆▆
WC_bio2 0 1 13.66 2.66 4.80 11.80 14.00 15.80 20.10 ▁▃▆▇▂
ER_tri 0 1 29.62 35.17 0.00 5.88 15.68 42.40 312.27 ▇▁▁▁▁
ER_topoWet 0 1 11.42 1.77 6.71 10.03 11.53 12.79 15.73 ▂▆▇▇▂
lon 0 1 -75.81 54.58 -130.32 -117.24 -104.71 -55.29 123.29 ▇▂▂▁▁
lat 0 1 25.52 20.30 -50.62 18.96 33.71 38.54 54.32 ▁▁▂▅▇

21) Tabular counts of 1 vs. 0 before and after split

# split data into training and testing
# create training set with 80% of full data
# (stratified on `present` so both classes keep the same proportion)
d_split  <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train  <- rsample::training(d_split)

# show number of rows present is 0 vs 1
table(d$present)
## 
##    0    1 
## 4919 4928
table(d_train$present)
## 
##    0    1 
## 3935 3942

22) rpart model output and plot, depth = 1

# run decision stump model
# (maxdepth = 1 allows a single split; cp = 0 disables complexity-based
#  pruning; minbucket = 5 sets the smallest allowed leaf size)
mdl <- rpart(
  present ~ ., data = d_train, 
  control = list(
    cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 7877 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 7877 3935 1 (0.499555668 0.500444332)  
##   2) lon>=-96.97078 3436   13 0 (0.996216531 0.003783469) *
##   3) lon< -96.97078 4441  512 1 (0.115289349 0.884710651) *
# plot tree 
# NOTE(review): par() margins are changed without restoring prior settings
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)

23) rpart model output and plot, depth = default

# decision tree with defaults
# (default rpart control: cp = 0.01, so growth stops when a split no
#  longer improves fit by that complexity threshold)
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 7877 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 7877 3935 1 (0.499555668 0.500444332)  
##   2) lon>=-96.97078 3436   13 0 (0.996216531 0.003783469) *
##   3) lon< -96.97078 4441  512 1 (0.115289349 0.884710651)  
##     6) WC_bio1< 5.15 187   59 0 (0.684491979 0.315508021) *
##     7) WC_bio1>=5.15 4254  384 1 (0.090267983 0.909732017) *
rpart.plot(mdl)

# plot complexity parameter
plotcp(mdl)

# rpart cross validation results
mdl$cptable
##           CP nsplit rel error    xerror        xstd
## 1 0.86658196      0 1.0000000 1.0259212 0.011273787
## 2 0.01753494      1 0.1334180 0.1341804 0.005640348
## 3 0.01000000      2 0.1158831 0.1176620 0.005305076

24) rpart complexity parameter plot

# caret cross validation results
# (10-fold CV over 20 candidate complexity-parameter values)
mdl_caret <- train(
  present ~ .,
  data       = d_train,
  method     = "rpart",
  trControl  = trainControl(method = "cv", number = 10),
  tuneLength = 20)

# accuracy vs complexity parameter
ggplot(mdl_caret)

25) Question:

Based on the complexity plot threshold, what size of tree is recommended?

26) rpart variable importance plot

# Variable importance from the caret-tuned rpart model.
vip(mdl_caret, num_features = 40, bar = FALSE)

# Partial dependence: two single-variable profiles plus a 2-D surface.
plt_lat  <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
plt_bio2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
plt_2d   <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>% 
  plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE, 
              colorkey = TRUE, screen = list(z = -20, x = -60))

# display plots side by side
gridExtra::grid.arrange(plt_lat, plt_bio2, plt_2d, ncol = 3)

27) Question:

what are the top 3 most important variables of your model?

28) RandomForest variable importance

# number of features
n_features <- length(setdiff(names(d_train), "present"))

# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)

# get out of the box RMSE
# NOTE(review): for a classification forest, prediction.error looks like the
# OOB misclassification rate, so this is sqrt(error rate) rather than a true
# RMSE — confirm intent
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.1922056
# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
  present ~ ., data = d_train,
  importance = "impurity")

# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
  present ~ ., data = d_train,
  importance = "permutation")
# compare the two importance measures side by side
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)

gridExtra::grid.arrange(p1, p2, nrow = 1)

1d.EVALUATE